/* Generates an additional data set in which N is kept constant, but the word order is randomized per book instead of per verse.
*/


/*
program to mask the word structure information
*/
qui {
/*
mask_it word

Adds a string variable named <word>_masked to the data in memory. Every
distinct value of the string variable <word> (word type) is mapped to a
random string of the same length, so all occurrences of a word type receive
the same mask. Replacement characters are drawn from the set of distinct
non-digit characters that occur in <word>.

Rules visible in the code below:
  - digits (0-9) are copied unchanged,
  - words that are one character long are left unmasked,
  - if two word types end up with the same mask, the later ones are reversed.

The original observation order and all other variables are preserved.
The random-number generator is used, so set the seed in the caller if
reproducible masks are needed.
*/
capture program drop mask_it
program mask_it
	args word
	tempfile TEMP1 TEMP2
	tempvar MATCH LENGTH DUPLICATE

	/* remember the observation order and keep a copy of the full data set */
	gen sortorder=_n
	save `TEMP1', replace

	/* reduce to one observation per word type */
	gen `MATCH'=1
	collapse (sum) `MATCH', by(`word') fast
	drop `MATCH'
	gen identifier=_n

	/* one observation per character position of every word type */
	gen `LENGTH'=ustrlen(`word')
	expand `LENGTH'
	bysort identifier: gen charid=_n
	gen char=usubstr(`word',charid,1)

	/* alphabet: all distinct non-digit characters, numbered in random order */
	preserve
	gen `MATCH'=1
	collapse (sum) `MATCH', by(char) fast
	drop if ustrregexm(char,"[0-9]")
	gen random=runiform()
	sort random
	gen char_shuffler=_n
	keep char char_shuffler
	ren char char_shuffled
	local char_sum=_N
	save `TEMP2', replace
	restore

	/* draw a random alphabet entry for every character position;
	   the alphabet file is unique on char_shuffler, hence m:1 */
	gen char_shuffler=runiformint(1,`char_sum')
	merge m:1 char_shuffler using `TEMP2', keep(master match) nogenerate

	/* digits are not masked */
	replace char_shuffled=char if ustrregexm(char,"[0-9]")

	/* glue the replacement characters of each word type together in
	   position order; the last position then holds the complete mask */
	bysort identifier (charid): gen char1=char_shuffled if _n==1
	by identifier: replace char1=char1[_n-1]+char_shuffled if _n>1
	by identifier: keep if _n==_N
	keep `word' char1

	/* one character words are not masked */
	replace char1=`word' if ustrlen(char1)==1

	/* duplicated masks: every repeat of an already used mask is reversed.
	   NOTE(review): reversing does not guarantee uniqueness (palindromes,
	   three or more identical masks, or a reversed mask that equals another
	   existing mask) - confirm that rare collisions are acceptable */
	bysort char1: gen `DUPLICATE'=_n
	replace char1=ustrreverse(char1) if `DUPLICATE'>1
	drop `DUPLICATE'

	/* attach the masks to the full data and restore the original order */
	merge 1:m `word' using `TEMP1', nogenerate
	sort sortorder

	rename char1 `word'_masked
	drop sortorder
end
}
*********************************************
/*
Main driver.

The first argument passed to this do-file is stored in the local g and is
used as a suffix for every input, scratch and output file, so that several
instances can run side by side without overwriting each other's files.

For every row of calculation_<g> (variables used: translation, book, min)
the text file <pwd>\length\<translation>_<book>_original.txt is processed
three times:
  original  - the first <min> words, unchanged
  structure - the first <min> words, each replaced by its mask (mask_it)
  order     - the first <min> words in random order
Each version is written to a scratch file, passed to shortestmismatcher.jar,
and the sum of li/log2(i) over the rows of the jar output is stored together
with the number of words and the number of output rows.
NOTE(review): the jar output is assumed to hold one row per character with
the match length in its second column - confirm against the jar.

Results are appended row by row to entropy_bible_constant_<g>. That file is
created with one empty row, which therefore remains its first observation.
*/
local g=`1'
local dir `c(pwd)'
qui {
clear
set obs 1
foreach type in original order structure {
	gen entropy_`type'=.
	gen words_`type'=.
	gen chars_`type'=.
}
gen book=.
gen str translation=""
save  entropy_bible_constant_`g', replace

use calculation_`g', clear

local till=_N
local counter=_N

forvalues i=1/`till' {
	/* reload the task list, since the data in memory are replaced below */
	use calculation_`g', clear
	local translation=translation[`i']
	local b=book[`i']
	local counter=`counter'-1
	local min=min[`i']
	foreach type in original structure order {
		/* one word per line: blanks become line breaks */
		filefilter "`dir'\length\\`translation'_`b'_original.txt" text_`g'.txt, from(`" "') to(\r\n) replace

		import delimited text_`g'.txt, delimiter(tab) varnames(nonames) stripquote(no) bindquotes(nobind) case(preserve) encoding(UTF-8) clear
		ren v1 word
		keep if _n<=`min'
		local words_`type'=_N
		if "`type'" =="structure"  {
			mask_it word
			keep word_masked
			/* give the masked text the name written by outsheet below
			   instead of relying on variable abbreviation */
			rename word_masked word
		}
		if "`type'" =="order" {
			gen random=runiform()
			sort random
		}
		outsheet word using "text_`g'.txt", nonames noquote replace

		/* back to running text: line breaks become blanks */
		filefilter text_`g'.txt text2_`g'.txt, from(\r\n) to(`" "') replace

		/* make sure a leftover result of an interrupted run is never read
		   in case the java call fails */
		capture erase text3_`g'.txt
		!java -Xmx2000M -jar "`dir'\\shortestmismatcher.jar" "`dir'\\text2_`g'.txt" "`dir'\\text3_`g'.txt"
		import delimited `"`dir'\\text3_`g'.txt"', delimiter(tab) varnames(nonames) stripquote(no) bindquotes(nobind) case(preserve) encoding(UTF-8) clear
		capture erase text3_`g'.txt
		local chars_`type'=_N
		ren v2 li
		gen i=_n
		/* for i==1 the denominator is zero, the result is missing and is
		   ignored by summarize */
		gen tosum=(li)/(log(i)/log(2))
		sum tosum
		local entropy_`type'=r(sum)
	}

	/* append one result row for this translation and book */
	use  entropy_bible_constant_`g', clear
	local new=_N+1
	set obs `new'
	local in `"in `new'"'
	foreach type in original order structure {
		replace entropy_`type'=`entropy_`type'' `in'
		replace chars_`type'=`chars_`type'' `in'
		replace words_`type'=`words_`type'' `in'
	}
	replace book=`b' `in'
	replace translation="`translation'" `in'
	save  entropy_bible_constant_`g', replace
	noisily di "`translation' `b' (left: `counter') "
}
}

/* remove the scratch text files of this run; files that do not exist are ignored */
foreach stem in text text2 text3 {
	capture erase `stem'_`g'.txt
}

/* write the one-observation marker data set finished_<g> */
clear
set obs 1
generate v = 1
save finished_`g', replace

/* empty memory and leave Stata */
clear
exit, STATA
